#!/usr/bin/perl

#=======================================================================
# u2h
# File ID: cab2ee1e-5d46-11df-8988-90e6ba3022ac
# Converts from UTF-8 charset to HTML numeric entities (&#x263A; and &#9786;).
#
# Character set: UTF-8
# ©opyleft STDyearDTS– Øyvind A. Holm <sunny@sunbase.org>
# License: GNU General Public License version 2 or later, see end of 
# file for legal stuff.
#=======================================================================

use strict;
use warnings;
use Getopt::Long;

local $| = 1;

our $Debug = 0;

our %Opt = (

    'ampersand' => 0,
    'debug' => 0,
    'decimal' => 0,
    'help' => 0,
    'invalid' => 0,
    'latin1' => 0,
    'standard' => 0,
    'verbose' => 0,
    'version' => 0,

);

our $progname = $0;
$progname =~ s/^.*\/(.*?)$/$1/;
our $VERSION = '0.00';

Getopt::Long::Configure('bundling');
GetOptions(

    'ampersand|a' => \$Opt{'ampersand'},
    'debug' => \$Opt{'debug'},
    'decimal|d' => \$Opt{'decimal'},
    'help|h' => \$Opt{'help'},
    'invalid|i' => \$Opt{'invalid'},
    'latin1|l' => \$Opt{'latin1'},
    'standard|s' => \$Opt{'standard'},
    'verbose|v+' => \$Opt{'verbose'},
    'version' => \$Opt{'version'},

) || die("$progname: Option error. Use -h for help.\n");

$Opt{'debug'} && ($Debug = 1);
$Opt{'help'} && usage(0);
if ($Opt{'version'}) {
    print_version();
    exit(0);
}

exit(main(%Opt));

sub main {
    # {{{
    my %Opt = @_;
    my $Retval = 0;

    my $amp_ent = $Opt{'decimal'} ? "&#38;" : "&#x26;";

    while (<>) {
        $Opt{'ampersand'} && s/&/$amp_ent/g;
        $Opt{'standard'} && s/([\x20-\x7F])/decode_char($1)/ge;
        s/([\xFC-\xFD][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF])/decode_char($1)/ge;
        s/([\xF8-\xFB][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF])/decode_char($1)/ge;
        s/([\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF])/decode_char($1)/ge;
        s/([\xE0-\xEF][\x80-\xBF][\x80-\xBF])/decode_char($1)/ge;
        s/([\xC0-\xDF][\x80-\xBF])/decode_char($1)/ge;
        print;
    }

    return($Retval);
    # }}}
} # main()

sub decode_char {
	# {{{
	my $Msg = shift;
	my $Val = "";
	if ($Msg =~ /^([\x20-\x7F])$/) {
		$Val = ord($1);
	} elsif ($Msg =~ /^([\xC0-\xDF])([\x80-\xBF])/) {
		if (!$Opt{'invalid'} && $Msg =~ /^[\xC0-\xC1]/) {
			$Val = 0xFFFD;
		} else {
			$Val = ((ord($1) & 0x1F) << 6) | (ord($2) & 0x3F);
		}
	} elsif ($Msg =~ /^([\xE0-\xEF])([\x80-\xBF])([\x80-\xBF])/) {
		if (!$Opt{'invalid'} && $Msg =~ /^\xE0[\x80-\x9F]/) {
			$Val = 0xFFFD;
		} else {
			$Val = ((ord($1) & 0x0F) << 12) |
			       ((ord($2) & 0x3F) <<  6) |
			       ( ord($3) & 0x3F);
		}
	} elsif ($Msg =~ /^([\xF0-\xF7])([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])/) {
		if (!$Opt{'invalid'} && $Msg =~ /^\xF0[\x80-\x8F]/) {
			$Val = 0xFFFD;
		} else {
			$Val = ((ord($1) & 0x07) << 18) |
			       ((ord($2) & 0x3F) << 12) |
			       ((ord($3) & 0x3F) <<  6) |
			       ( ord($4) & 0x3F);
		}
	} elsif ($Msg =~ /^([\xF8-\xFB])([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])/) {
		if (!$Opt{'invalid'} && $Msg =~ /^\xF8[\x80-\x87]/) {
			$Val = 0xFFFD;
		} else {
			$Val = ((ord($1) & 0x03) << 24) |
			       ((ord($2) & 0x3F) << 18) |
			       ((ord($3) & 0x3F) << 12) |
			       ((ord($4) & 0x3F) <<  6) |
			       ( ord($5) & 0x3F);
		}
	} elsif ($Msg =~ /^([\xFC-\xFD])([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])([\x80-\xBF])/) {
		if (!$Opt{'invalid'} && $Msg =~ /^\xFC[\x80-\x83]/) {
			$Val = 0xFFFD;
		} else {
			$Val = ((ord($1) & 0x01) << 30) |
			       ((ord($2) & 0x3F) << 24) |
			       ((ord($3) & 0x3F) << 18) |
			       ((ord($4) & 0x3F) << 12) |
			       ((ord($5) & 0x3F) <<  6) |
			       ( ord($6) & 0x3F);
		}
	}
	unless ($Opt{'invalid'}) {
		if (($Val >= 0xD800 && $Val <= 0xDFFF) || ($Val eq 0xFFFE) || ($Val eq 0xFFFF)) {
			$Val = 0xFFFD;
		}
	}
	return ($Opt{'latin1'} && ($Val <= 0xFF)) ? chr($Val) : sprintf(($Opt{'decimal'} ? "&#%u;" : "&#x%X;"), $Val);
	# }}}
} # decode_char()

sub print_version {
    # Print program version {{{
    print("$progname v$VERSION\n");
    return;
    # }}}
} # print_version()

sub usage {
    # Send the help message to stdout {{{
    my $Retval = shift;

    if ($Opt{'verbose'}) {
        print("\n");
        print_version();
    }
    print(<<"END");

Usage: $progname [options] [file [files [...]]]

Options:

  -a, --ampersand
    Convert Ampersand into entity.
  -d, --decimal
    Use decimal values.
  -h, --help
    Show this help.
  -i, --invalid
    Accept invalid sequences (overlong sequences and surrogates).
  -l, --latin1
    Convert U+0080 through U+00FF to latin-1 instead of entities.
  -s, --standard
    Also convert standard ascii U+0020 through U+007F.
  -v, --verbose
    Increase level of verbosity. Can be repeated.
  --version
    Print version information.
  --debug
    Print debugging messages.

END
    exit($Retval);
    # }}}
} # usage()

sub msg {
    # Print a status message to stderr based on verbosity level {{{
    my ($verbose_level, $Txt) = @_;

    if ($Opt{'verbose'} >= $verbose_level) {
        print(STDERR "$progname: $Txt\n");
    }
    return;
    # }}}
} # msg()

sub D {
    # Print a debugging message {{{
    $Debug || return;
    my @call_info = caller;
    chomp(my $Txt = shift);
    my $File = $call_info[1];
    $File =~ s#\\#/#g;
    $File =~ s#^.*/(.*?)$#$1#;
    print(STDERR "$File:$call_info[2] $$ $Txt\n");
    return('');
    # }}}
} # D()

__END__

# Plain Old Documentation (POD) {{{

=pod

=head1 LICENCE

This program is free software; you can redistribute it and/or modify it 
under the terms of the GNU General Public License as published by the 
Free Software Foundation; either version 2 of the License, or (at your 
option) any later version.

This program is distributed in the hope that it will be useful, but 
WITHOUT ANY WARRANTY; without even the implied warranty of 
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along 
with this program.
If not, see L<http://www.gnu.org/licenses/>.

=cut

# }}}

# vim: set fenc=UTF-8 ft=perl fdm=marker ts=4 sw=4 sts=4 et fo+=w :
